Comparison of including vs. excluding 2D variables


In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%pylab inline
pd.__version__ # need 0.14.0 for multiindex slicing


Populating the interactive namespace from numpy and matplotlib
Out[1]:
'0.14.1'

Read files


In [2]:
# run with 2D variables excluded (3D variables only)
oe = pd.read_table("overall_statistics_3d.txt").set_index(["K","M","STATISTIC"])["VALUE"].unstack().loc(axis=0)[6:10,160:200]
ve = pd.read_table("variable_statistics_3d.txt").set_index(["K","M","STATISTIC","VARIABLE"])["VALUE"].unstack().unstack().loc(axis=0)[6:10,160:200]

In [3]:
# run with 2D variables included (all variables)
oi = pd.read_table("overall_statistics_ksmall.txt").set_index(["K","M","STATISTIC"])["VALUE"].unstack().loc(axis=0)[6:10,160:200]
vi = pd.read_table("variable_statistics_ksmall.txt").set_index(["K","M","STATISTIC","VARIABLE"])["VALUE"].unstack().unstack().loc(axis=0)[6:10,160:200]
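
A minimal sketch (toy values, not the real statistics files) of the long format these reads assume: tab-separated columns K, M, STATISTIC (plus VARIABLE for the per-variable files) and VALUE. set_index followed by unstack pivots STATISTIC into columns, and .loc(axis=0)[6:10,160:200] slices the (K, M) MultiIndex, the multiindex-slicing feature noted above; the sort_index() here only guards against an unsorted index in the toy frame.

import pandas as pd
toy = pd.DataFrame({
    "K":         [6, 6, 8, 8],
    "M":         [160, 160, 200, 200],
    "STATISTIC": ["rms_error", "max_error"] * 2,
    "VALUE":     [0.002, 0.05, 0.001, 0.04],   # toy numbers, for illustration only
})
wide = toy.set_index(["K", "M", "STATISTIC"])["VALUE"].unstack().sort_index()
print(wide.loc(axis=0)[6:10, 160:200])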

Add fixed compression ratios


In [4]:
N_c = 88*30 # 3D variables only (88 variables x 30 levels), vertical stacking
N_d = 48602 # for 3D variables, vertical stacking
original_size = N_c * N_d
compressed_size = lambda K, M: N_d + N_c * K + N_d * M + N_c * K * M
oe["compression_ratio_fixed"] = compressed_size(np.array(oe.index.get_level_values("K")),np.array(oe.index.get_level_values("M"))) / original_size
#oe.loc[:,"compression_ratio_fixed"].unstack("K")

In [5]:
N_c = 3008  # all variables (2D and 3D), vertical stacking
N_d = 48602 # for all variables, vertical stacking
original_size = N_c * N_d
compressed_size = lambda K, M: N_d + N_c * K + N_d * M + N_c * K * M
oi["compression_ratio_fixed"] = compressed_size(np.array(oi.index.get_level_values("K")),np.array(oi.index.get_level_values("M"))) / original_size
#oi.loc[:,"compression_ratio_fixed"].unstack("K")
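
As a quick sanity check of the formula above, here is the fixed ratio evaluated at one illustrative (K, M) pair inside the sliced range (K = 8, M = 180 is just an example point and may or may not be one of the grid values in the statistics files):

ratio = lambda N_c, N_d, K, M: (N_d + N_c*K + N_d*M + N_c*K*M) / (N_c * N_d)
print(ratio(88*30, 48602, 8, 180))  # ~0.098 for the 3D-only sizes
print(ratio(3008,  48602, 8, 180))  # ~0.090 for the all-variables sizes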

Compare errors and compression ratios


In [24]:
# rms error vs compression ratio, one line per K
# average the per-variable statistics across variables, attach the overall stats (incl. the fixed compression ratio), and group by K
grouped_e = ve.loc(axis=0)[7:10,:].mean(axis=1,level="STATISTIC").join(oe).reset_index().groupby("K")
grouped_i = vi.loc(axis=0)[7:10,:].mean(axis=1,level="STATISTIC").join(oi).reset_index().groupby("K")
for key,grp in grouped_e:
    plt.plot(grp["compression_ratio_fixed"],grp["rms_error"],"--", label="K = " + str(key) + " (excl.)")
for key,grp in grouped_i:
    plt.plot(grp["compression_ratio_fixed"],grp["rms_error"],"-", label="K = " + str(key) + " (incl.)")
plt.legend()
plt.xlabel("compression ratio")
plt.ylabel("mean rms error")
#plt.title("error vs compression ratio, by K")
#plt.xlim((0.07,0.15))
plt.ylim((0.001,0.0025))


Out[24]:
(0.001, 0.0025)

In [26]:
# max error vs compression ratio, one line per K
grouped_e = ve.loc(axis=0)[7:10,:].mean(axis=1,level="STATISTIC").join(oe).reset_index().groupby("K")
grouped_i = vi.loc(axis=0)[7:10,:].mean(axis=1,level="STATISTIC").join(oi).reset_index().groupby("K")
for key,grp in grouped_e:
    plt.plot(grp["compression_ratio_fixed"],grp["max_error"],"--", label="K = " + str(key) + " (excl.)")
for key,grp in grouped_i:
    plt.plot(grp["compression_ratio_fixed"],grp["max_error"],"-", label="K = " + str(key) + " (incl.)")
plt.legend()
plt.xlabel("compression ratio")
plt.ylabel("mean max error")
#plt.title("error vs compression ratio, by K")
plt.xlim((0.075,0.14))
plt.ylim((0.035,0.085))


Out[26]:
(0.035, 0.085)

In [7]:
for key,grp in grouped_e:
    plt.plot(grp["compression_ratio_fixed"],grp["L_final"],"-", label="K = " + str(key) + " (excl.)")
for key,grp in grouped_i:
    plt.plot(grp["compression_ratio_fixed"],grp["L_final"],":", label="K = " + str(key) + " (incl.)")
plt.legend()
plt.xlabel("compression ratio")
plt.ylabel("final L value")
#plt.title("error vs compression ratio, by K")
plt.xlim((0.07,0.15))
#plt.ylim((0.001,0.002))


Out[7]:
(0.07, 0.15)

Error ranking for 2D/3D variables


In [8]:
# load variable information for joining levels to variables
v_info = pd.read_table("variable_information.txt").set_index(["VARIABLE","INFO"]).unstack().loc[:,"VALUE"]
v_info["levels"] = v_info["levels"].astype("int")
v_info.columns.name = ""

In [9]:
# rank all variables by their mean error (largest error first) and attach the number of levels
v_ranked_rms = vi.mean(axis=0).unstack().sort("rms_error", ascending=False).join(v_info)["levels"].reset_index().reset_index()
v_ranked_max = vi.mean(axis=0).unstack().sort("max_error", ascending=False).join(v_info)["levels"].reset_index().reset_index()
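
The histograms below use a hard-coded cutoff of 93 for the "upper half"; assuming that is meant to be the midpoint of the ranking (about 186 variables in total), the cutoff could instead be derived from the data:

upper_half_cutoff = len(v_ranked_rms) // 2   # assumed equivalent to the hard-coded 93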

In [10]:
v_ranked_rms[v_ranked_rms.levels < 30]["index"].hist(bins=range(0,220,20))
plt.xlabel("rank when ordered by rms error (larger is better)")
plt.ylabel("number of variables")
plt.xlim((0,200))
plt.ylim((0,20))
print("Percentage in upper half:", 100*len(v_ranked_rms[(v_ranked_rms.levels < 30) & (v_ranked_rms.index >= 93)]) / len(v_ranked_rms[v_ranked_rms.levels < 30]))


Percentage in upper half: 43.82022471910113

In [11]:
v_ranked_max[v_ranked_max.levels < 30]["index"].hist(bins=range(0,220,20))
plt.xlabel("rank when ordered by maximum error (larger is better)")
plt.ylabel("number of variables")
plt.xlim((0,200))
plt.ylim((0,20))
print("Percentage in upper half:", 100*len(v_ranked_max[(v_ranked_max.levels < 30) & (v_ranked_max.index >= 93)]) / len(v_ranked_max[v_ranked_max.levels < 30]))


Percentage in upper half: 75.28089887640449